Installing Required Packages

# install required packages/library
install.packages("matlib")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("rsample")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("corrplot")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("MASS")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
## Warning: package 'MASS' is not available for this version of R
## 'MASS' version 7.3-65 is in the repositories but depends on R (>= 4.4.0)
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
install.packages("gridExtra")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("dplyr")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("knitr")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("matrixStats")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("ggdensity")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

Loading Libraries

# import required packages/libraries
library(matlib)     # for matrix operations
library(ggplot2)    # for data visualization
library(rsample)    # for data splitting
library(corrplot)   # for correlation visualization
## corrplot 0.95 loaded
library(MASS)       # for statistical functions
library(gridExtra)  # for arranging plots
library(dplyr)      # for data manipulation
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)      # for tables
library(doParallel) # For parallel processing
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(foreach)    # For parallel processing (used with doParallel)
library(ggdensity)  # For enhanced 2D density plots/contours

Loading Datasets

# load features dataset (independent variables), as a numeric matrix
features <- as.matrix(read.csv(file="data/features.csv", header=FALSE))
colnames(features) <- c("x1", "x3", "x4", "x5")

# load target dataset (dependent variable)
target <- as.matrix(read.csv(file="data/target.csv", header=FALSE))
colnames(target) <- c("X2")

# load time series dataset (redundant extra parentheses removed)
time <- as.matrix(read.csv(file="data/timeseries.csv", header=FALSE))
colnames(time) <- c("T1")

# display the first few rows of each dataset
cat("Features Dataset: \n")
## Features Dataset:
head(features)
##         x1    x3      x4    x5
## [1,]  8.34 40.77 1010.84 90.01
## [2,] 23.64 58.49 1011.40 74.20
## [3,] 29.74 56.90 1007.15 41.91
## [4,] 19.07 49.69 1007.22 76.79
## [5,] 11.80 40.66 1017.13 97.20
## [6,] 13.97 39.16 1016.05 84.60
cat("\nTarget Dataset: \n")
## 
## Target Dataset:
head(target)
##          X2
## [1,] 480.48
## [2,] 445.75
## [3,] 438.76
## [4,] 453.09
## [5,] 464.43
## [6,] 470.96
cat("\nTimeseries Dataset: \n")
## 
## Timeseries Dataset:
# BUG FIX: this previously called head(features) again, so the "Timeseries
# Dataset" heading displayed the feature matrix instead of the time data.
# (The stale output shown here in the original knit has been removed;
# re-knit to show the T1 column.)
head(time)

Summary Statistics

# create a combined dataframe with descriptive column names
# NOTE(review): the summary below shows x3 spanning ~25-82 (typical of exhaust
# vacuum in cm Hg) and x4 spanning ~993-1033 (typical of ambient pressure in
# millibar), which is the opposite of the labels assigned here -- verify the
# column-to-physical-variable mapping against the dataset documentation.
combined_df <- data.frame(
  Temperature = features[,"x1"],
  Ambient_Pressure = features[,"x3"],
  Relative_Humidity = features[,"x4"],
  Exhaust_Vacuum = features[,"x5"],
  Energy_Output = target[,"X2"]
)

# summary statistics (min / quartiles / mean / max per column)
summary_stats <- summary(combined_df)
print(summary_stats)
##   Temperature    Ambient_Pressure Relative_Humidity Exhaust_Vacuum  
##  Min.   : 1.81   Min.   :25.36    Min.   : 992.9    Min.   : 25.56  
##  1st Qu.:13.51   1st Qu.:41.74    1st Qu.:1009.1    1st Qu.: 63.33  
##  Median :20.34   Median :52.08    Median :1012.9    Median : 74.97  
##  Mean   :19.65   Mean   :54.31    Mean   :1013.3    Mean   : 73.31  
##  3rd Qu.:25.72   3rd Qu.:66.54    3rd Qu.:1017.3    3rd Qu.: 84.83  
##  Max.   :37.11   Max.   :81.56    Max.   :1033.3    Max.   :100.16  
##  Energy_Output  
##  Min.   :420.3  
##  1st Qu.:439.8  
##  Median :451.6  
##  Mean   :454.4  
##  3rd Qu.:468.4  
##  Max.   :495.8

Task 1: Preliminary Data Analysis

Time Series Analysis

Time Series Analysis (Input Signal)

# Plot a numeric time series as a line chart, optionally overlaying a centred
# rolling average and/or thinning the plotted points at a fixed interval.
#
# Arguments:
#   time_series     numeric vector (or univariate ts) to plot
#   var_name        plot title
#   xlab, ylab      axis labels (ylab has no default and must be supplied)
#   col             line colour for the raw series
#   rolling_avg     if TRUE, overlay a centred moving average in red
#   rolling_window  window length (in points) of the moving average
#   sample_interval if non-NULL, keep only every sample_interval-th point
#
# Called for its plotting side effect; returns nothing useful.
plot_time_series_enhanced <- function(time_series, var_name, xlab = "Time", ylab, 
                                     col = "black", 
                                     rolling_avg = FALSE, rolling_window = 100, 
                                     sample_interval = NULL) {
  
  time_series_to_plot <- time_series
  time_indices_to_plot <- seq_along(time_series)
  rolling_mean_to_plot <- NULL
  
  # ROBUSTNESS FIX: a window larger than half the series makes the NA-trim
  # range below run backwards and silently reverse the data; skip instead.
  if (rolling_avg && 2 * rolling_window > length(time_series)) {
    warning("rolling_window too large for this series; skipping rolling average.")
    rolling_avg <- FALSE
  }
  
  # Rolling Average Calculation (centred moving average via stats::filter)
  if (rolling_avg) {
    rolling_mean <- stats::filter(time_series, rep(1/rolling_window, rolling_window), sides = 2)
    
    # Trim NA values at both ends; keeps a conservative margin of one full
    # window on each side (slightly more than strictly needed).
    valid_indices <- rolling_window:(length(time_series) - rolling_window + 1)
    rolling_mean <- rolling_mean[valid_indices]
    time_series_to_plot <- time_series_to_plot[valid_indices]
    time_indices_to_plot <- time_indices_to_plot[valid_indices]
    rolling_mean_to_plot <- rolling_mean
  }
  
  # Sampling at Intervals (thins only the plotted points, not the data)
  if (!is.null(sample_interval)) {
    sampled_indices <- seq(1, length(time_series_to_plot), by = sample_interval)
    time_series_to_plot <- time_series_to_plot[sampled_indices]
    time_indices_to_plot <- time_indices_to_plot[sampled_indices]
    if (rolling_avg) {
      rolling_mean_to_plot <- rolling_mean_to_plot[sampled_indices]
    }
  }
  
  # Plotting
  plot(time_indices_to_plot, time_series_to_plot,
       type = "l",
       main = var_name,
       xlab = xlab,
       ylab = ylab,
       col = col,
       cex.lab = 1.2,
       cex.main = 1.4
  )
  
  # Add Rolling Average Line (if requested)
  if (rolling_avg) {
    lines(time_indices_to_plot, rolling_mean_to_plot, col = "red", lwd = 2)
    # BUG FIX: paste() inserted a stray space ("100 -Point Avg");
    # paste0() yields the intended "100-Point Avg" legend label.
    legend("topright", legend = c("Original (or Sampled)", paste0(rolling_window, "-Point Avg")),
           col = c(col, "red"), lty = 1, lwd = c(1, 2), cex = 0.8)
  }
}

# Convert features to a multivariate time series object.
# BUG FIX: ts() interprets a length-2 'start' as c(period, sub-period), so the
# old start = c(min(time), max(time)) was meaningless.  Only the first time
# stamp is needed.  The plotting helper indexes by observation number
# (seq_along), so no figure changes.
features.ts <- ts(features, start = min(time), frequency = 1)

# Plot each input signal with a 100-point rolling average, sampled every 90 points
plot_time_series_enhanced(features.ts[, "x1"], "Temperature", ylab = "Temperature (°C)", col = "darkred", rolling_avg = TRUE, rolling_window = 100, sample_interval = 90)

plot_time_series_enhanced(features.ts[, "x3"], "Ambient Pressure", ylab = "Pressure (millibar)", col = "darkblue", rolling_avg = TRUE, rolling_window = 100, sample_interval = 90)

plot_time_series_enhanced(features.ts[, "x4"], "Relative Humidity", ylab = "Humidity (%)", col = "darkgreen", rolling_avg = TRUE, rolling_window = 100, sample_interval = 90)

plot_time_series_enhanced(features.ts[, "x5"], "Exhaust Vacuum", ylab = "Vacuum (cm Hg)", col = "darkorange", rolling_avg = TRUE, rolling_window = 100, sample_interval = 90)

Time Series Analysis (Output Signal)

# Plot the target series as a line chart, optionally overlaying a centred
# rolling average and/or thinning the plotted points at a fixed interval.
# Same contract as plot_time_series_enhanced plus a configurable line width.
#
# Arguments:
#   time_series     numeric vector (or univariate ts) to plot
#   var_name        plot title
#   xlab, ylab      axis labels (ylab has no default and must be supplied)
#   col, lwd        colour and width of the raw series line
#   rolling_avg     if TRUE, overlay a centred moving average in red
#   rolling_window  window length (in points) of the moving average
#   sample_interval if non-NULL, keep only every sample_interval-th point
#
# Called for its plotting side effect; returns nothing useful.
plot_target_time_series_enhanced <- function(time_series, var_name, xlab = "Time", ylab,
                                            col = "purple", lwd = 1.5,
                                            rolling_avg = FALSE, rolling_window = 100,
                                            sample_interval = NULL) {

  time_series_to_plot <- time_series
  time_indices_to_plot <- seq_along(time_series)
  rolling_mean_to_plot <- NULL

  # ROBUSTNESS FIX: a window larger than half the series makes the NA-trim
  # range below run backwards and silently reverse the data; skip instead.
  if (rolling_avg && 2 * rolling_window > length(time_series)) {
    warning("rolling_window too large for this series; skipping rolling average.")
    rolling_avg <- FALSE
  }

  # Rolling Average Calculation (centred moving average via stats::filter)
  if (rolling_avg) {
    rolling_mean <- stats::filter(time_series, rep(1/rolling_window, rolling_window), sides = 2)

    # Trim NA values at both ends; conservative one-window margin per side
    valid_indices <- rolling_window:(length(time_series) - rolling_window + 1)
    rolling_mean <- rolling_mean[valid_indices]
    time_series_to_plot <- time_series_to_plot[valid_indices]
    time_indices_to_plot <- time_indices_to_plot[valid_indices]
    rolling_mean_to_plot <- rolling_mean
  }

  # Sampling at Intervals (thins only the plotted points)
  if (!is.null(sample_interval)) {
    sampled_indices <- seq(1, length(time_series_to_plot), by = sample_interval)
    time_series_to_plot <- time_series_to_plot[sampled_indices]
    time_indices_to_plot <- time_indices_to_plot[sampled_indices]
    if (rolling_avg) {
      rolling_mean_to_plot <- rolling_mean_to_plot[sampled_indices]
    }
  }

  # Plotting
  plot(time_indices_to_plot, time_series_to_plot,
       type = "l",
       main = var_name,
       xlab = xlab,
       ylab = ylab,
       col = col,
       lwd = lwd,
       cex.lab = 1.2,
       cex.main = 1.4
  )

  # Add Rolling Average Line 
  if (rolling_avg) {
    lines(time_indices_to_plot, rolling_mean_to_plot, col = "red", lwd = 2)
    # BUG FIX: paste() inserted a stray space ("100 -Point Avg");
    # paste0() yields the intended "100-Point Avg" legend label.
    legend("topright", legend = c("Original (or Sampled)", paste0(rolling_window, "-Point Avg")),
           col = c(col, "red"), lty = 1, lwd = c(1, 2), cex = 0.8)
  }
}

# Convert target to a time series object.
# BUG FIX: ts() interprets a length-2 'start' as c(period, sub-period), so the
# old start = c(min(time), max(time)) was meaningless.  The plotting helper
# indexes by observation number, so the figure is unchanged.
target.ts <- ts(target, start = min(time), frequency = 1)

# Plotting the target time series with options
plot_target_time_series_enhanced(target.ts, "Net Hourly Electrical Energy Output",
                                 xlab = "Time", ylab = "Energy Output (MW)",
                                 col = "purple", lwd = 1.5,
                                 rolling_avg = TRUE, rolling_window = 100,
                                 sample_interval = 90)

Distribution of Each Signal

# Draw a density-scaled histogram of `data` titled `title`, with a kernel
# density curve overlaid on top.  Called for its plotting side effect.
plot_enhanced_histogram <- function(data, title, xlab, col = "lightblue", density_col = "darkblue") {
  # estimate the kernel density up front, then draw histogram + overlay
  kde <- density(data)
  hist(
    data,
    freq   = FALSE,
    main   = title,
    xlab   = xlab,
    col    = col,
    border = "white"
  )
  lines(kde, lwd = 2, col = density_col)
}

# Draw the five distribution histograms via a spec list + loop.
# Titles, labels and colours are identical to the original individual calls.
histogram_specs <- list(
  list(values = features[, "x1"],
       title = "Distribution of Temperature (x1)",
       xlab = "Ambient temperature (°C)",
       fill = "lightpink", line = "deeppink"),
  list(values = features[, "x3"],
       title = "Distribution of Ambient Pressure (x3)",
       xlab = "Atmospheric pressure (millibar)",
       fill = "lightcyan", line = "darkcyan"),
  list(values = features[, "x4"],
       title = "Distribution of Relative Humidity (x4)",
       xlab = "Humidity level (%)",
       fill = "lemonchiffon", line = "gold"),
  list(values = features[, "x5"],
       title = "Distribution of Exhaust Vacuum (x5)",
       xlab = "Vacuum (cm Hg)",
       fill = "lavender", line = "mediumpurple"),
  list(values = target[, "X2"],
       title = "Distribution of Net Hourly Electrical Energy Output (X2)",
       xlab = "Net hourly electrical energy output (MW)",
       fill = "honeydew", line = "forestgreen")
)

for (spec in histogram_specs) {
  plot_enhanced_histogram(spec$values, spec$title, spec$xlab,
                          col = spec$fill, density_col = spec$line)
}

Correlation Analysis and Scatter Plots

Correlation Matrix

# combine features and target for correlation analysis (columns side by side)
combined <- cbind(features, target)

# calculate correlation matrix (Pearson, over all five columns)
correlation <- cor(combined)
print(correlation)
##            x1         x3          x4          x5         X2
## x1  1.0000000  0.8441067 -0.50754934 -0.54253465 -0.9481285
## x3  0.8441067  1.0000000 -0.41350216 -0.31218728 -0.8697803
## x4 -0.5075493 -0.4135022  1.00000000  0.09957432  0.5184290
## x5 -0.5425347 -0.3121873  0.09957432  1.00000000  0.3897941
## X2 -0.9481285 -0.8697803  0.51842903  0.38979410  1.0000000
# create enhanced correlation plot: upper triangle only, coefficients overlaid
corrplot(correlation, method = "color", type = "upper", 
         addCoef.col = "black", number.cex = 0.7,
         tl.col = "black", tl.srt = 45,
         title = "Correlation Matrix of Power Plant Variables",
         mar = c(0, 0, 1, 0))

Scatter Plots

# One colour per feature, softened with 70% opacity
base_colors <- c("steelblue", "tomato", "forestgreen", "orchid")
plot_colors <- sapply(base_colors, function(col) adjustcolor(col, alpha.f = 0.7))

# Axis labels for each feature-vs-target scatter plot
plots <- list(
  x1 = c("Ambient Temp (°C)",       "Net Energy Output (MW)"),
  x3 = c("Ambient Pressure (mbar)", "Net Energy Output (MW)"),
  x4 = c("Relative Humidity (%)",   "Net Energy Output (MW)"),
  x5 = c("Exhaust Vacuum (cm Hg)",  "Net Energy Output (MW)")
)

# Reset to a single plotting panel
par(mfrow = c(1, 1), mar = c(4, 4, 2, 1))

# Scatter of each feature against the target, coloured per feature
target_values <- target[, "X2"]
for (i in seq_along(plots)) {
  feature_name <- names(plots)[i]
  axis_labels  <- plots[[feature_name]]

  plot(
    features[, feature_name], target_values,
    xlab = axis_labels[1],
    ylab = axis_labels[2],
    main = paste(axis_labels[1], "vs", axis_labels[2]),
    pch  = 16,
    cex  = 0.6,
    col  = plot_colors[i]
  )
}

Task 2: Regression Modeling

Task 2.1: Model Parameters

# Ordinary least squares via the Moore-Penrose pseudo-inverse (MASS::ginv).
#
# X: n x p design matrix (include an intercept column if desired).
# y: n x 1 response vector/matrix.
# Returns the p x 1 coefficient estimate ginv(X'X) X'y.
#
# BUG FIX: the previous singularity test `det(t(X) %*% X) == 0` compared a
# float to zero exactly, and det() over/underflows for large or badly scaled
# X, so it could both miss and falsely flag rank deficiency.  A QR rank check
# is numerically reliable.
fit_ls <- function(X, y) {
  if (qr(X)$rank < ncol(X)) {
    warning("Matrix (X'X) is singular. OLS coefficients might not be unique.")
  }
  # crossprod(X) == t(X) %*% X, crossprod(X, y) == t(X) %*% y (faster, same result)
  ginv(crossprod(X)) %*% crossprod(X, y)
}

# Standardize the columns of a numeric matrix (mean 0, sd 1).
# Constant columns (sd == 0) produce NaN under scale() and are mapped to 0.
# Returns an empty matrix for NULL or zero-column input.
#
# BUG FIX: the old NULL branch called nrow(mat) on a NULL input, and
# matrix(nrow = NULL, ...) errors -- NULL is now handled before nrow().
scale_features_matrix <- function(mat) {
  if (is.null(mat)) {
    return(matrix(nrow = 0, ncol = 0))
  }
  if (ncol(mat) == 0) {
    return(matrix(nrow = nrow(mat), ncol = 0))
  }
  scaled_mat <- scale(mat)
  scaled_mat[is.na(scaled_mat)] <- 0 # constant columns -> all zeros
  return(scaled_mat)
}

# Build each candidate model's design matrix and fit it by OLS.
# The five copy-pasted blocks are collapsed into one spec-driven loop;
# the globals the original script created (X1..X5 plus the *_raw_features
# and *_scaled_features intermediates) are still assigned, since the design
# matrices are reused in Task 2.2.
model_feature_specs <- list(
  Model1 = function(f) cbind(x4 = f[, "x4"], x3_squared = f[, "x3"]^2),                          # y = θ1·x4 + θ2·x3² + θbias
  Model2 = function(f) cbind(x4 = f[, "x4"], x3_squared = f[, "x3"]^2, x5 = f[, "x5"]),          # y = θ1·x4 + θ2·x3² + θ3·x5 + θbias
  Model3 = function(f) cbind(x3 = f[, "x3"], x4 = f[, "x4"], x5_cubed = f[, "x5"]^3),            # y = θ1·x3 + θ2·x4 + θ3·x5³ + θbias
  Model4 = function(f) cbind(x4 = f[, "x4"], x3_squared = f[, "x3"]^2, x5_cubed = f[, "x5"]^3),  # y = θ1·x4 + θ2·x3² + θ3·x5³ + θbias
  Model5 = function(f) cbind(x4 = f[, "x4"], x1_squared = f[, "x1"]^2, x3_squared = f[, "x3"]^2) # y = θ1·x4 + θ2·x1² + θ3·x3² + θbias
)

theta_estimates <- list()

for (i in seq_along(model_feature_specs)) {
  model_id <- names(model_feature_specs)[i]

  raw_mat    <- model_feature_specs[[model_id]](features)
  scaled_mat <- scale_features_matrix(raw_mat)
  design_mat <- cbind("(Intercept)" = 1, scaled_mat)

  fitted_theta <- fit_ls(design_mat, target)
  rownames(fitted_theta) <- colnames(design_mat)
  theta_estimates[[model_id]] <- fitted_theta

  # preserve the original per-model globals (X1..X5 are used in Task 2.2)
  assign(paste0("X", i, "_raw_features"), raw_mat)
  assign(paste0("X", i, "_scaled_features"), scaled_mat)
  assign(paste0("X", i), design_mat)
}

# Print results 
for (model in names(theta_estimates)) {
  cat("\n", model, "θ̂ =\n")
  print(theta_estimates[[model]])
}
## 
##  Model1 θ̂ =
##                     X2
## (Intercept) 454.365009
## x4            3.348278
## x3_squared  -13.211452
## 
##  Model2 θ̂ =
##                     X2
## (Intercept) 454.365009
## x4            3.432657
## x3_squared  -12.406622
## x5            2.517326
## 
##  Model3 θ̂ =
##                     X2
## (Intercept) 454.365009
## x3          -12.722943
## x4            3.402683
## x5_cubed      2.315840
## 
##  Model4 θ̂ =
##                     X2
## (Intercept) 454.365009
## x4            3.487220
## x3_squared  -12.402038
## x5_cubed      2.487015
## 
##  Model5 θ̂ =
##                     X2
## (Intercept) 454.365009
## x4            1.349107
## x1_squared  -10.605331
## x3_squared   -5.173124

Task 2.2: Model Residual Sum of Squared Error (RSS)

# Residual sum of squares for a linear model y ~ X theta.
# Returns NA when the prediction cannot be formed (e.g. dimension mismatch)
# or contains missing values.
compute_rss_matrix <- function(X, y, theta) {
  fitted_vals <- tryCatch(X %*% theta, error = function(e) NULL)
  if (is.null(fitted_vals) || anyNA(fitted_vals)) {
    return(NA)
  }
  resids <- y - fitted_vals
  sum(resids * resids)
}

# Compute the RSS of every fitted model against its own design matrix.
# The five duplicated if/else blocks are collapsed into one loop; the
# theta_1..theta_5 globals the original script created are still assigned.
rss_estimates <- list()

design_matrices <- list(Model1 = X1, Model2 = X2, Model3 = X3,
                        Model4 = X4, Model5 = X5)

for (i in seq_along(design_matrices)) {
  model_id  <- names(design_matrices)[i]
  theta_hat <- theta_estimates[[model_id]]
  assign(paste0("theta_", i), theta_hat) # keep theta_1..theta_5 available, as before

  if (!all(is.na(theta_hat))) {
    rss_estimates[[model_id]] <- compute_rss_matrix(design_matrices[[model_id]], target, theta_hat)
  } else {
    rss_estimates[[model_id]] <- NA
  }
}


cat("\nResidual Sum of Squares (RSS) \n")
## 
## Residual Sum of Squares (RSS)
for (model_name in names(rss_estimates)) {
  cat("\n", model_name, "RSS =", rss_estimates[[model_name]], "\n")
}
## 
##  Model1 RSS = 657248.2 
## 
##  Model2 RSS = 602347.1 
## 
##  Model3 RSS = 547491.6 
## 
##  Model4 RSS = 603630.7 
## 
##  Model5 RSS = 365625

Task 2.3: Log-likelihood and Variance Calculation

# Constants for Log-Likelihood Calculation 
n <- nrow(target) # Number of data samples
ln_2pi <- log(2 * pi) # Pre-calculate ln(2π)

# Compute Log-Likelihood and Variance for each model 
log_likelihood_values <- list()
variance_estimates <- list() # List to store calculated variances (sigma_hat_squared)

# Iterate through the RSS values
for (model_name in names(rss_estimates)) {
  current_rss <- rss_estimates[[model_name]]

  # Only compute log-likelihood if RSS is a valid number
  if (!is.na(current_rss) && is.numeric(current_rss)) {
    # Calculate sigma_hat_squared 
    # NOTE(review): RSS/(n-1) follows the coursework formula; the ML estimator
    # would be RSS/n and the unbiased OLS estimator RSS/(n-k) -- confirm
    # which divisor is intended.
    sigma_hat_squared <- current_rss / (n - 1)
    variance_estimates[[model_name]] <- sigma_hat_squared # Store the variance

    # Check for non-positive or zero sigma_hat_squared before log
    if (sigma_hat_squared <= 0) {
      warning(paste("Model", model_name, ": sigma_hat_squared is non-positive or zero. Log-likelihood set to NA."))
      log_likelihood_values[[model_name]] <- NA
    } else {
      # Gaussian log-likelihood: -(n/2)ln(2π) - (n/2)ln(σ̂²) - RSS/(2σ̂²)
      log_likelihood <- - (n / 2) * ln_2pi - (n / 2) * log(sigma_hat_squared) - (1 / (2 * sigma_hat_squared)) * current_rss
      log_likelihood_values[[model_name]] <- log_likelihood
    }
  } else {
    log_likelihood_values[[model_name]] <- NA # Set log-likelihood to NA if RSS was invalid
    variance_estimates[[model_name]] <- NA # Set variance to NA if RSS was invalid
  }
}

cat("\n Log-Likelihood and Variance: \n")
## 
##  Log-Likelihood and Variance:
for (model_name in names(log_likelihood_values)) {
  cat("\n", model_name, "\n")
  cat("  Variance (σ̂²):", variance_estimates[[model_name]], "\n")
  cat("  Log-Likelihood:", log_likelihood_values[[model_name]], "\n")
}
## 
##  Model1 
##   Variance (σ̂²): 68.69951 
##   Log-Likelihood: -33810.99 
## 
##  Model2 
##   Variance (σ̂²): 62.96091 
##   Log-Likelihood: -33393.69 
## 
##  Model3 
##   Variance (σ̂²): 57.2271 
##   Log-Likelihood: -32936.88 
## 
##  Model4 
##   Variance (σ̂²): 63.09509 
##   Log-Likelihood: -33403.88 
## 
##  Model5 
##   Variance (σ̂²): 38.21731 
##   Log-Likelihood: -31005.4

Task 2.4: Compute AIC and BIC

# Constants for AIC/BIC Calculation 
n <- nrow(target) # Number of data samples

# 'k' = number of estimated parameters per model (coefficients + intercept).
# CONSISTENCY FIX: derived from the fitted parameter vectors themselves so it
# cannot silently drift from the model definitions (the old hand-maintained
# list had to be edited in two places).  Values are unchanged:
#   Model1 = 3 (θ1·x4 + θ2·x3² + θbias), Model2..Model5 = 4
k_values <- lapply(theta_estimates, length)

# Compute AIC and BIC for each model
aic_values <- list()
bic_values <- list()

# Iterate through the log-likelihood values
for (model_name in names(log_likelihood_values)) {
  current_log_likelihood <- log_likelihood_values[[model_name]]
  k_param <- k_values[[model_name]] # Get the 'k' value for the current model

  # Only compute AIC/BIC if log-likelihood is a valid number and k is defined
  if (!is.na(current_log_likelihood) && is.numeric(current_log_likelihood) &&
      !is.null(k_param) && !is.na(k_param) && is.numeric(k_param)) {

    # Compute AIC: AIC = 2k - 2 ln p(D|θ̂)
    aic <- 2 * k_param - 2 * current_log_likelihood
    aic_values[[model_name]] <- aic

    # Compute BIC: BIC = k * ln(n) - 2 ln p(D|θ̂)
    bic <- k_param * log(n) - 2 * current_log_likelihood
    bic_values[[model_name]] <- bic

  } else {
    aic_values[[model_name]] <- NA # Set to NA if inputs were invalid
    bic_values[[model_name]] <- NA # Set to NA if inputs were invalid
  }
}

cat("\n Akaike Information Criterion (AIC) & Bayesian Information Criterion (BIC): \n")
## 
##  Akaike Information Criterion (AIC) & Bayesian Information Criterion (BIC):
for (model_name in names(aic_values)) {
  cat("\n", model_name, "\n")
  cat("  AIC:", aic_values[[model_name]], "\n")
  cat("  BIC:", bic_values[[model_name]], "\n")
}
## 
##  Model1 
##   AIC: 67627.98 
##   BIC: 67649.48 
## 
##  Model2 
##   AIC: 66795.38 
##   BIC: 66824.05 
## 
##  Model3 
##   AIC: 65881.77 
##   BIC: 65910.43 
## 
##  Model4 
##   AIC: 66815.75 
##   BIC: 66844.42 
## 
##  Model5 
##   AIC: 62018.79 
##   BIC: 62047.46

Task 2.5: Distribution of Model Prediction Errors

# Design-matrix builders used to reconstruct each model's X for residual
# analysis.  Each takes (feature matrix, scaling function) and returns the
# scaled feature columns with an intercept column prepended.
model_x_builders <- list(
  Model1 = function(feats, scaler) {
    cbind("(Intercept)" = 1,
          scaler(cbind(x4 = feats[, "x4"], x3_squared = feats[, "x3"]^2)))
  },
  Model2 = function(feats, scaler) {
    cbind("(Intercept)" = 1,
          scaler(cbind(x4 = feats[, "x4"], x3_squared = feats[, "x3"]^2, x5 = feats[, "x5"])))
  },
  Model3 = function(feats, scaler) {
    cbind("(Intercept)" = 1,
          scaler(cbind(x3 = feats[, "x3"], x4 = feats[, "x4"], x5_cubed = feats[, "x5"]^3)))
  },
  Model4 = function(feats, scaler) {
    cbind("(Intercept)" = 1,
          scaler(cbind(x4 = feats[, "x4"], x3_squared = feats[, "x3"]^2, x5_cubed = feats[, "x5"]^3)))
  },
  Model5 = function(feats, scaler) {
    cbind("(Intercept)" = 1,
          scaler(cbind(x4 = feats[, "x4"], x1_squared = feats[, "x1"]^2, x3_squared = feats[, "x3"]^2)))
  }
)

# For each model: rebuild its design matrix, form residuals against the fitted
# parameters, and draw a normal Q-Q plot of the prediction errors.
for (model_name in names(model_x_builders)) {
  design <- model_x_builders[[model_name]](features, scale_features_matrix)
  coef_hat <- theta_estimates[[model_name]]

  # Skip models whose estimates are missing or do not match the design matrix
  if (is.null(coef_hat) || any(is.na(coef_hat)) || length(coef_hat) != ncol(design)) {
    warning(paste("Cannot compute residuals for", model_name, ": Invalid, missing, or dimension-mismatched theta estimates."))
    next
  }

  # Coerce a bare vector of coefficients into a column matrix for %*%
  if (is.vector(coef_hat)) {
    coef_hat <- matrix(coef_hat, ncol = 1)
  }

  # Prediction errors y - X θ̂, flattened for qqnorm()
  error_vec <- as.vector(target - design %*% coef_hat)

  # Guard against NA/Inf residuals before plotting
  if (any(is.na(error_vec)) || any(is.infinite(error_vec))) {
    warning(paste("Residuals for", model_name, "contain NA/Inf values. Q-Q plot may not be generated properly."))
    next
  }

  qqnorm(error_vec, main = paste("Q-Q Plot of Residuals for", model_name))
  qqline(error_vec, col = "red")
}

Task 2.6: Selection of Best Regression Model

# Compile all model evaluation metrics into a comprehensive table.
# IDIOM FIX: ifelse() on scalars is fragile (vectorised semantics, strips
# attributes) and the is.null() branch could never fire inside sapply(), so
# each column now uses vapply() with a plain if/else — values are unchanged
# and the result is guaranteed to be a typed numeric vector.
model_comparison <- data.frame(
  Model = names(theta_estimates), # Using names from theta_estimates
  Parameters = vapply(theta_estimates,
                      function(theta) length(theta[!is.na(theta)]),
                      integer(1)),
  RSS = vapply(rss_estimates,
               function(rss) if (is.null(rss) || length(rss) == 0 || is.na(rss)) NA_real_ else as.numeric(rss),
               numeric(1)),
  LogLikelihood = vapply(log_likelihood_values,
                         function(val) if (is.null(val) || is.na(val)) NA_real_ else as.numeric(val),
                         numeric(1)),
  AIC = vapply(aic_values,
               function(val) if (is.null(val) || is.na(val)) NA_real_ else as.numeric(val),
               numeric(1)),
  BIC = vapply(bic_values,
               function(val) if (is.null(val) || is.na(val)) NA_real_ else as.numeric(val),
               numeric(1)),
  Variance = vapply(variance_estimates,
                    function(val) if (is.null(val) || is.na(val)) NA_real_ else as.numeric(val),
                    numeric(1))
)

# Display the comprehensive table 
cat("\n Comprehensive Comparison of All Models \n")
## 
##  Comprehensive Comparison of All Models
# Render the comparison table as a markdown (pipe) table via knitr::kable()
print(kable(model_comparison,
            caption = "Comprehensive Comparison of All Models",
            digits = c(0, 0, 2, 2, 2, 2, 4)))
## 
## 
## Table: Comprehensive Comparison of All Models
## 
## |       |Model  | Parameters|      RSS| LogLikelihood|      AIC|      BIC| Variance|
## |:------|:------|----------:|--------:|-------------:|--------:|--------:|--------:|
## |Model1 |Model1 |          3| 657248.2|     -33810.99| 67627.98| 67649.48|  68.6995|
## |Model2 |Model2 |          4| 602347.1|     -33393.69| 66795.38| 66824.05|  62.9609|
## |Model3 |Model3 |          4| 547491.6|     -32936.88| 65881.77| 65910.43|  57.2271|
## |Model4 |Model4 |          4| 603630.7|     -33403.88| 66815.75| 66844.42|  63.0951|
## |Model5 |Model5 |          4| 365625.0|     -31005.40| 62018.79| 62047.46|  38.2173|
cat("\n")
# Rank models based on different criteria 
# (ties.method = "min" gives competition-style ranks; NA values keep rank NA)
rank_rss <- rank(model_comparison$RSS, na.last = "keep", ties.method = "min")
rank_aic <- rank(model_comparison$AIC, na.last = "keep", ties.method = "min")
rank_bic <- rank(model_comparison$BIC, na.last = "keep", ties.method = "min")
rank_loglik <- rank(-model_comparison$LogLikelihood, na.last = "keep", ties.method = "min") # Negative because higher is better

# One row per model, one column per criterion rank
ranking_table <- data.frame(
  Model = model_comparison$Model,
  RSS_Rank = rank_rss,
  AIC_Rank = rank_aic,
  BIC_Rank = rank_bic,
  LogLikelihood_Rank = rank_loglik
)

# Calculate Overall_Rank by summing individual ranks, considering only valid ranks
ranking_table$Overall_Rank <- rowSums(ranking_table[, c("RSS_Rank", "AIC_Rank", "BIC_Rank", "LogLikelihood_Rank")], na.rm = TRUE)

# Sort by overall rank (lower sum is better)
ranking_table <- ranking_table[order(ranking_table$Overall_Rank),]

# Display the ranking table (markdown pipe table via knitr::kable)
cat("\n Model Ranking Based on Different Criteria (Lower Sum of Ranks is Better) \n")
## 
##  Model Ranking Based on Different Criteria (Lower Sum of Ranks is Better)
print(kable(ranking_table,
            caption = "Model Ranking Based on Different Criteria (Lower is Better)"))
## 
## 
## Table: Model Ranking Based on Different Criteria (Lower is Better)
## 
## |   |Model  | RSS_Rank| AIC_Rank| BIC_Rank| LogLikelihood_Rank| Overall_Rank|
## |:--|:------|--------:|--------:|--------:|------------------:|------------:|
## |5  |Model5 |        1|        1|        1|                  1|            4|
## |3  |Model3 |        2|        2|        2|                  2|            8|
## |2  |Model2 |        3|        3|        3|                  3|           12|
## |4  |Model4 |        4|        4|        4|                  4|           16|
## |1  |Model1 |        5|        5|        5|                  5|           20|
cat("\n") 
# Determine the best model based on individual criteria 
# (which.min/which.max skip NA values and return the first index on ties)
best_model_aic_index <- which.min(model_comparison$AIC)
best_model_aic_name <- model_comparison$Model[best_model_aic_index]

best_model_bic_index <- which.min(model_comparison$BIC)
best_model_bic_name <- model_comparison$Model[best_model_bic_index]

# Higher log-likelihood is better, hence which.max here
best_model_loglik_index <- which.max(model_comparison$LogLikelihood)
best_model_loglik_name <- model_comparison$Model[best_model_loglik_index]

best_model_rss_index <- which.min(model_comparison$RSS)
best_model_rss_name <- model_comparison$Model[best_model_rss_index]

cat(paste("Based on AIC, the best model is:", best_model_aic_name, "\n"))
## Based on AIC, the best model is: Model5
cat(paste("Based on BIC, the best model is:", best_model_bic_name, "\n"))
## Based on BIC, the best model is: Model5
cat(paste("Based on Log-Likelihood, the best model is:", best_model_loglik_name, "\n"))
## Based on Log-Likelihood, the best model is: Model5
cat(paste("Based on RSS, the best model is:", best_model_rss_name, "\n"))
## Based on RSS, the best model is: Model5

Task 2.7: Train-Test Split

set.seed(123) # Set seed for reproducibility of data splitting

# Split the input (features) and output (target) dataset
sample_size <- nrow(features)
train_indices <- sample(sample_size, size = round(0.7 * sample_size))
test_indices <- setdiff(1:sample_size, train_indices)

# Create training and testing datasets
train_features <- features[train_indices, ]
train_target <- target[train_indices, ]
test_features <- features[test_indices, ]
y_test <- target[test_indices, ]

cat(paste0("Data split: Training samples = ", nrow(train_features), ", Testing samples = ", nrow(test_features), "\n"))
## Data split: Training samples = 6698, Testing samples = 2870
# Use the best model (selected by AIC above) and estimate its parameters
# using the training dataset.
best_model_name <- best_model_aic_name

# Reconstruct the design matrix (X_train) for the best model using training features.
# Guard: the builder registry comes from Task 2.5 and must exist first.
if (!exists("model_x_builders") || is.null(model_x_builders[[best_model_name]])) {
  stop("model_x_builders or the builder for the best model is not available. Please ensure Task 2.5 code is run first.")
}

# Build the model-specific design matrix from the training features
# (scale_features_matrix is the scaling info computed earlier in the file).
X_train_model <- model_x_builders[[best_model_name]](train_features, scale_features_matrix)

# Estimate model parameters using the training dataset (least squares via fit_ls)
theta_train <- fit_ls(X_train_model, train_target)
rownames(theta_train) <- colnames(X_train_model)

cat(paste0("\nEstimated parameters for ", best_model_name, " using training data:\n"))
## 
## Estimated parameters for Model5 using training data:
print(theta_train)
##                   [,1]
## (Intercept) 454.354339
## x4            1.280183
## x1_squared  -10.646216
## x3_squared   -5.149130
# Compute the model's output/prediction on the testing data using the
# same design-matrix builder that was used for training.
X_test_model <- model_x_builders[[best_model_name]](test_features, scale_features_matrix)

# Compute predictions on the testing data
y_pred_test <- X_test_model %*% theta_train

# Calculate test-error metrics from a single residual vector
# (computed once instead of re-evaluating y_test - y_pred_test four times).
residuals_test <- y_test - y_pred_test
test_rss <- sum(residuals_test^2)            # residual sum of squares
test_rmse <- sqrt(mean(residuals_test^2))    # root mean squared error
test_mae <- mean(abs(residuals_test))        # mean absolute error
# Coefficient of determination; reuses test_rss rather than recomputing it
test_r_squared <- 1 - test_rss / sum((y_test - mean(y_test))^2)

# Display test metrics in a small summary table
test_metrics <- data.frame(
  Metric = c("RSS", "RMSE", "MAE", "R-squared"),
  Value = c(test_rss, test_rmse, test_mae, test_r_squared)
)

print(kable(test_metrics, caption = paste0("Model Performance on Test Data for ", best_model_name), format = "pipe", digits = 3))
## 
## 
## Table: Model Performance on Test Data for Model5
## 
## |Metric    |      Value|
## |:---------|----------:|
## |RSS       | 106713.984|
## |RMSE      |      6.098|
## |MAE       |      4.859|
## |R-squared |      0.873|
cat("\n")
# Compute 95% prediction intervals for the test-set predictions, using the
# classical linear-model formula: var(pred) = sigma^2 * (1 + x0' (X'X)^-1 x0).
n_train <- nrow(X_train_model)
k_params <- ncol(X_train_model) # Number of estimated parameters including intercept

# Calculate residuals from the training fit to estimate sigma_hat_squared
residuals_train <- train_target - (X_train_model %*% theta_train)
rss_train <- sum(residuals_train^2)

# Estimate variance of residuals (sigma_hat_squared) from training data
sigma_hat_squared_train <- rss_train / (n_train - k_params)
sigma_hat_train <- sqrt(sigma_hat_squared_train)

# Calculate the inverse of (X_train_model'X_train_model);
# fall back to the Moore-Penrose pseudo-inverse if X'X is singular.
XTX_inv_train <- tryCatch(
  solve(t(X_train_model) %*% X_train_model),
  error = function(e) {
    warning("Matrix (X'X) is singular for training data. Using ginv from MASS for inverse.")
    ginv(t(X_train_model) %*% X_train_model)
  }
)

# Degrees of freedom for t-distribution
# (name kept as `df` for compatibility; note it shadows stats::df locally)
df <- n_train - k_params
if (df <= 0) {
  warning("Degrees of freedom are zero or negative. Cannot compute confidence intervals.")
  # NA_real_ keeps the CI columns numeric rather than logical
  lower_ci <- rep(NA_real_, nrow(y_pred_test))
  upper_ci <- rep(NA_real_, nrow(y_pred_test))
} else {
  # T-value for 95% confidence (alpha = 0.05, two-tailed)
  t_critical <- qt(0.975, df = df)

  # Leverage term x0' (X'X)^-1 x0 for every test row at once:
  # rowSums((X A) * X) is the diagonal of X A X' without forming the
  # full n x n product or allocating a 1 x k matrix per row in a loop.
  leverage <- rowSums((X_test_model %*% XTX_inv_train) * X_test_model)

  # Standard deviation of the prediction for a new observation
  pred_sd <- sqrt(sigma_hat_squared_train * (1 + leverage))
  margin_of_error <- t_critical * pred_sd

  lower_ci <- as.vector(y_pred_test) - margin_of_error
  upper_ci <- as.vector(y_pred_test) + margin_of_error
}

# Convert to vectors for plotting consistency in ggplot
# (y_pred_test is an n x 1 matrix from %*%; ggplot wants plain vectors)
y_test_vec <- as.vector(y_test)
y_pred_test_vec <- as.vector(y_pred_test)

# Prepare data for ggplot: one row per test observation with its
# actual value, prediction, and 95% interval bounds.
plot_data <- data.frame(
  index = seq_along(y_test_vec),   # Use seq_along on the vector
  Actual = y_test_vec,             # Use the vector directly
  Predicted = y_pred_test_vec,     # Use the vector directly
  LowerCI = lower_ci,
  UpperCI = upper_ci
)

# Define how many rows to display 
num_display_rows <- min(10, nrow(plot_data)) # Display up to 10 rows

# Select the sample rows and display them using kable for nice formatting
sample_display_df <- plot_data[1:num_display_rows, c("Actual", "Predicted", "LowerCI", "UpperCI")]
print(kable(sample_display_df,
            caption = paste0("Sample of ", num_display_rows, " Test Predictions with 95% Confidence Intervals for ", best_model_name),
            format = "pipe",
            digits = 3)) # Adjust digits for desired precision
## 
## 
## Table: Sample of 10 Test Predictions with 95% Confidence Intervals for Model5
## 
## | Actual| Predicted| LowerCI| UpperCI|
## |------:|---------:|-------:|-------:|
## | 470.96|   469.773| 457.578| 481.967|
## | 451.41|   454.666| 442.470| 466.862|
## | 473.74|   473.549| 461.355| 485.744|
## | 441.76|   440.817| 428.622| 453.011|
## | 474.71|   476.158| 463.961| 488.354|
## | 487.69|   478.450| 466.253| 490.647|
## | 452.16|   451.566| 439.373| 463.760|
## | 483.26|   476.192| 463.997| 488.387|
## | 433.59|   430.476| 418.278| 442.675|
## | 435.14|   443.392| 431.194| 455.591|
cat("\n")
# Generate ggplot: actual test values (points), predictions (dashed line),
# and the shaded 95% prediction-interval band.
ggplot(plot_data, aes(x = index)) +
  geom_ribbon(aes(ymin = LowerCI, ymax = UpperCI, fill = "95% CI"), alpha = 0.2, na.rm = TRUE) + # Shaded CI
  # `linewidth` replaces the `size` aesthetic for lines (deprecated in ggplot2 3.4.0),
  # silencing the deprecation warning this chunk previously emitted.
  geom_line(aes(y = Predicted, color = "Predicted"), linetype = "dashed", linewidth = 0.8, na.rm = TRUE) + # Predicted line
  geom_point(aes(y = Actual, color = "Actual"), size = 1.0, alpha = 0.7, na.rm = TRUE) + # Actual points
  scale_color_manual(name = "colour",
                     values = c("Actual" = "blue", "Predicted" = "red")) +
  scale_fill_manual(name = "fill", values = c("95% CI" = "grey")) +
  labs(
    title = paste0(best_model_name, " : Actual vs. Predicted with 95% CI (Test Set)"), # Match title style
    x = "index", # Match x-axis label style
    y = "Net hourly output" # Match y-axis label style
  ) +
  theme_minimal() + # Use a minimal theme
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 9)
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Task 3: Approximate Bayesian Computation (ABC)

Posterior Distribution and Parameter Estimation

# Specify Model 5 as the best model (hard-coded to match the Task 2.6 result)
best_model_name_abc <- "Model5"
cat(paste0("Using '", best_model_name_abc, "' as the best model for ABC.\n"))
## Using 'Model5' as the best model for ABC.
# Ensure the model_x_builder for Model 5 is available
if (!exists("model_x_builders") || is.null(model_x_builders[[best_model_name_abc]])) {
  stop("model_x_builders or the builder for Model5 is not available. Please ensure Task 2.5 code is run first.")
}

# Ensures that the ABC is based on a training dataset.
# NOTE(review): this rsample split is independent of the manual Task 2.7 split
# (same seed but different sampling mechanism: 6697 vs 6698 training rows).
set.seed(123) # for reproducibility of data split
data_full <- as.data.frame(cbind(features, target))
data_split <- initial_split(data_full, prop = 0.70) # 70% for training

# Extract train/test partitions back out as matrices, keyed by column name
features_train <- as.matrix(training(data_split)[, colnames(features)])
target_train <- as.matrix(training(data_split)[, colnames(target)])

features_test <- as.matrix(testing(data_split)[, colnames(features)])
target_test <- as.matrix(testing(data_split)[, colnames(target)])

cat(paste0("\nData split into training (", nrow(features_train), " samples) and testing (", nrow(features_test), " samples).\n"))
## 
## Data split into training (6697 samples) and testing (2871 samples).
# Construct the design matrix for Model 5 using the TRAINING dataset
X_train_full_design_matrix <- model_x_builders[[best_model_name_abc]](features_train, scale_features_matrix)

# Check for valid column names in X_train_full_design_matrix.
# Column names are load-bearing here: they become theta row names and are
# later used to select which parameters to vary in ABC.
if (is.null(colnames(X_train_full_design_matrix)) || any(is.na(colnames(X_train_full_design_matrix))) || any(colnames(X_train_full_design_matrix) == "")) {
  stop("The design matrix (X_train_full_design_matrix) for Model5 contains invalid column names (NULL, NA, or empty string). Please ensure valid, non-empty column names are generated by model_x_builders.")
}

# Estimate parameters for Model 5 using the TRAINING dataset (least squares)
theta_estimate_model_5_train <- fit_ls(X_train_full_design_matrix, target_train)
rownames(theta_estimate_model_5_train) <- colnames(X_train_full_design_matrix)

cat(paste0("\nLeast squares estimates for ", best_model_name_abc, " (training dataset):\n"))
## 
## Least squares estimates for Model5 (training dataset):
print(theta_estimate_model_5_train)
##                   [,1]
## (Intercept) 454.356682
## x4            1.279909
## x1_squared  -10.646107
## x3_squared   -5.149565
# Convert the theta estimates to a data frame so they can be sorted by
# absolute magnitude; the two largest-|estimate| parameters will be varied
# in the ABC procedure and the rest held fixed.
theta_df <- data.frame(
  Parameter = rownames(theta_estimate_model_5_train),
  Estimate = as.vector(theta_estimate_model_5_train)
)
theta_df$AbsEstimate <- abs(theta_df$Estimate)
theta_df_ordered <- theta_df[order(-theta_df$AbsEstimate), ]

# Select the top 2 parameters with largest absolute values to vary
params_to_vary_info <- theta_df_ordered[1:2, ]
params_to_vary_names <- params_to_vary_info$Parameter

# Drop invalid parameter names element-wise. The original filter also ANDed
# in !is.null(params_to_vary_names), but is.null() is a scalar test on the
# whole vector (always FALSE here), so that term was a no-op and is removed.
params_to_vary_names <- params_to_vary_names[!is.na(params_to_vary_names) & params_to_vary_names != ""]

# Re-check the length after filtering to ensure at least 2 valid names remain
if (length(params_to_vary_names) < 2) {
  stop("After validation and filtering, fewer than 2 valid parameter names remain for ABC. Please check the model definition and data for issues with parameter names or coefficients.")
}

# Ensure selected parameter names exist in the theta estimate's row names
if (!all(params_to_vary_names %in% rownames(theta_estimate_model_5_train))) {
  stop(paste0("Error: Selected parameters to vary (", paste(params_to_vary_names, collapse = ", "), ") do not exist as row names in the training model's theta estimate. Available names: ", paste(rownames(theta_estimate_model_5_train), collapse = ", "), ". This might indicate an issue with feature matrix column naming or theta estimation."))
}


# The remaining parameters (rank 3 onward by |estimate|) are fixed to their
# least-squares estimated values during ABC.
if (nrow(theta_df_ordered) > 2) {
  fixed_params_info <- theta_df_ordered[3:nrow(theta_df_ordered), ]
  fixed_params_values <- setNames(fixed_params_info$Estimate, fixed_params_info$Parameter)
} else {
  # Degenerate case: the model has <= 2 parameters, so nothing stays fixed;
  # keep the same (empty) shapes so downstream code works unchanged.
  fixed_params_info <- data.frame(Parameter = character(0), Estimate = numeric(0), AbsEstimate = numeric(0))
  fixed_params_values <- setNames(numeric(0), character(0))
}


cat(paste0("\nParameters to vary in ABC:\n"))
## 
## Parameters to vary in ABC:
print(params_to_vary_info)
##     Parameter  Estimate AbsEstimate
## 1 (Intercept) 454.35668   454.35668
## 3  x1_squared -10.64611    10.64611
cat(paste0("\nFixed parameters (and their estimated values):\n"))
## 
## Fixed parameters (and their estimated values):
print(fixed_params_values)
## x3_squared         x4 
##  -5.149565   1.279909
# Retrieve the LS estimates for the two parameters to be varied for prior centering
prior_mean_param1 <- theta_estimate_model_5_train[params_to_vary_names[1], 1]
prior_mean_param2 <- theta_estimate_model_5_train[params_to_vary_names[2], 1]

# Define uniform prior ranges: centered around LS estimate, with a minimum width
prior_range_half_param1 <- max(5 * abs(prior_mean_param1), 0.5) # Ensures exploration even if LS estimate is small
prior_range_half_param2 <- max(5 * abs(prior_mean_param2), 0.5)

prior_param1_min <- prior_mean_param1 - prior_range_half_param1
prior_param1_max <- prior_mean_param1 + prior_range_half_param1

prior_param2_min <- prior_mean_param2 - prior_range_half_param2
prior_param2_max <- prior_mean_param2 + prior_range_half_param2

priors_abc <- list(
  param1 = list(name = params_to_vary_names[1], type = "uniform", min = prior_param1_min, max = prior_param1_max),
  param2 = list(name = params_to_vary_names[2], type = "uniform", min = prior_param2_min, max = prior_param2_max)
)

# Display the prior parameter names and their uniform ranges
cat("\nPrior parameters (Uniform Distribution Ranges):\n")
## 
## Prior parameters (Uniform Distribution Ranges):
cat(paste0("- ", priors_abc$param1$name, ": [", round(priors_abc$param1$min, 4), ", ", round(priors_abc$param1$max, 4), "]\n"))
## - (Intercept): [-1817.4267, 2726.1401]
cat(paste0("- ", priors_abc$param2$name, ": [", round(priors_abc$param2$min, 4), ", ", round(priors_abc$param2$max, 4), "]\n"))
## - x1_squared: [-63.8766, 42.5844]
# Draw one candidate parameter set from the uniform priors in `priors_abc`.
# Returns a named numeric vector with one draw per varying parameter, always
# in the order param1 then param2 so RNG consumption stays deterministic.
sample_from_priors <- function() {
  draws <- c()
  for (prior in list(priors_abc$param1, priors_abc$param2)) {
    draws[prior$name] <- runif(1, prior$min, prior$max)
  }
  return(draws)
}

# Calculate the observed summary statistic (RSS) for the OLS model on the
# TRAINING data. This is the reference value the ABC tolerance is scaled from.
y_pred_obs_train_model <- X_train_full_design_matrix %*% theta_estimate_model_5_train
residuals_obs_train_model <- target_train - y_pred_obs_train_model
observed_rss_train_model <- sum(residuals_obs_train_model^2)
cat(paste0("\nObserved RSS of ", best_model_name_abc, " on training dataset: ", observed_rss_train_model, "\n"))
## 
## Observed RSS of Model5 on training dataset: 258937.811114784

Implement Rejection ABC Algorithm

# Summary statistic for ABC: the residual sum of squares obtained when a
# candidate parameter vector is applied to a design matrix and compared
# against the observed responses.
#
# Args:
#   simulated_theta_vector: candidate coefficients (one per design column).
#   X_matrix: design matrix used to generate the candidate predictions.
#   observed_y: observed response values to compare against.
#
# Returns the scalar RSS.
calculate_simulated_rss <- function(simulated_theta_vector, X_matrix, observed_y) {
  fitted_values <- X_matrix %*% simulated_theta_vector
  sum((observed_y - fitted_values)^2)
}

cat(paste0("Summary statistic chosen: Residual Sum of Squares (RSS).\n"))
## Summary statistic chosen: Residual Sum of Squares (RSS).
cat(paste0("The observed RSS calculated on training data is: ", observed_rss_train_model, "\n"))
## The observed RSS calculated on training data is: 258937.811114784
# Define ABC parameters
n_abc_simulations <- 5000000 # Number of simulations to run for ABC
# Tolerance: accept draws whose RSS is within 50% above the observed RSS
epsilon <- 1.5 * observed_rss_train_model
cat(paste0("Epsilon (tolerance for acceptance): ", epsilon, "\n"))
## Epsilon (tolerance for acceptance): 388406.716672176
# Initialize list to collect accepted samples (filled by the rejection loop below)
accepted_samples_list <- list()

set.seed(42) # Set seed for reproducibility of random sampling within ABC

# Using a for loop with explicit acceptance based on epsilon
cat(paste0("\nStarting ABC simulations (", n_abc_simulations, " iterations).\n"))
## 
## Starting ABC simulations (5e+06 iterations).
pb <- txtProgressBar(min = 0, max = n_abc_simulations, style = 3) # Progress bar
##   |                                                                              |                                                                      |   0%
# Define all parameter names, including fixed ones, for constructing current_theta_abc
all_param_names <- rownames(theta_estimate_model_5_train)

# Rejection-ABC loop: draw the two varying parameters from their priors,
# keep the remaining parameters at their LS estimates, and accept the draw
# when its training RSS falls below epsilon.
# NOTE(review): accepted samples are appended one data.frame at a time;
# for large acceptance counts, collecting plain vectors and binding once
# at the end would be faster — confirm before changing, since downstream
# code (outside this view) consumes accepted_samples_list.
for (i in 1:n_abc_simulations) {
  # Sample parameters from the prior distribution
  proposed_theta_varying <- sample_from_priors()

  # Create a full parameter vector for the current simulation:
  # start from the training LS estimates, then overwrite the two varied entries
  current_theta_abc <- theta_estimate_model_5_train # Start with training LS estimates
  current_theta_abc[params_to_vary_names[1], 1] <- proposed_theta_varying[params_to_vary_names[1]]
  current_theta_abc[params_to_vary_names[2], 1] <- proposed_theta_varying[params_to_vary_names[2]]

  # Calculate the summary statistic for the simulated data using TRAINING data
  simulated_rss <- calculate_simulated_rss(
    simulated_theta_vector = current_theta_abc,
    X_matrix = X_train_full_design_matrix, # Use training design matrix
    observed_y = target_train              # Use training target
  )

  # Rejection step: accept if simulated RSS is less than epsilon
  if (simulated_rss < epsilon) {
    # Store the accepted sample (the two varying parameters and their RSS)
    temp_data_list <- list()
    temp_data_list[[params_to_vary_names[1]]] <- proposed_theta_varying[params_to_vary_names[1]]
    temp_data_list[[params_to_vary_names[2]]] <- proposed_theta_varying[params_to_vary_names[2]]
    temp_data_list[["rss"]] <- simulated_rss
    accepted_samples_list[[length(accepted_samples_list) + 1]] <- as.data.frame(temp_data_list)
  }
  setTxtProgressBar(pb, i) # Update progress bar
}
##   |                                                                              |                                                                      |   1%  |                                                                              |=                                                                     |   1%  |                                                                              |=                                                                     |   2%  |                                                                              |==                                                                    |   2%  |                                                                              |==                                                                    |   3%  |                                                                              |==                                                                    |   4%  |                                                                              |===                                                                   |   4%  |                                                                              |===                                                                   |   5%  |                                                                              |====                                                                  |   5%  |                                                                              |====                                                                  |   6%  |                                                                              |=====                                                                 |   6%  |                                                                              |=====                                                                 |   7%  |                                                                              |=====              
                                                   |   8%  |                                                                              |======                                                                |   8%  |                                                                              |======                                                                |   9%  |                                                                              |=======                                                               |   9%  |                                                                              |=======                                                               |  10%  |                                                                              |=======                                                               |  11%  |                                                                              |========                                                              |  11%  |                                                                              |========                                                              |  12%  |                                                                              |=========                                                             |  12%  |                                                                              |=========                                                             |  13%  |                                                                              |=========                                                             |  14%  |                                                                              |==========                                                            |  14%  |                                                                              |==========                                                            |  15%  |                                            
                                  |===========                                                           |  15%  |                                                                              |===========                                                           |  16%  |                                                                              |============                                                          |  16%  |                                                                              |============                                                          |  17%  |                                                                              |============                                                          |  18%  |                                                                              |=============                                                         |  18%  |                                                                              |=============                                                         |  19%  |                                                                              |==============                                                        |  19%  |                                                                              |==============                                                        |  20%  |                                                                              |==============                                                        |  21%  |                                                                              |===============                                                       |  21%  |                                                                              |===============                                                       |  22%  |                                                                              |================                                                     
 |  22%  |                                                                              |================                                                      |  23%  |                                                                              |================                                                      |  24%  |                                                                              |=================                                                     |  24%  |                                                                              |=================                                                     |  25%  |                                                                              |==================                                                    |  25%  |                                                                              |==================                                                    |  26%  |                                                                              |===================                                                   |  26%  |                                                                              |===================                                                   |  27%  |                                                                              |===================                                                   |  28%  |                                                                              |====================                                                  |  28%  |                                                                              |====================                                                  |  29%  |                                                                              |=====================                                                 |  29%  |                                                                              
|=====================                                                 |  30%  |                                                                              |=====================                                                 |  31%  |                                                                              |======================                                                |  31%  |                                                                              |======================                                                |  32%  |                                                                              |=======================                                               |  32%  |                                                                              |=======================                                               |  33%  |                                                                              |=======================                                               |  34%  |                                                                              |========================                                              |  34%  |                                                                              |========================                                              |  35%  |                                                                              |=========================                                             |  35%  |                                                                              |=========================                                             |  36%  |                                                                              |==========================                                            |  36%  |                                                                              |==========================                                            |  37%  |                        
                                                      |==========================                                            |  38%  |                                                                              |===========================                                           |  38%  |                                                                              |===========================                                           |  39%  |                                                                              |============================                                          |  39%  |                                                                              |============================                                          |  40%  |                                                                              |============================                                          |  41%  |                                                                              |=============================                                         |  41%  |                                                                              |=============================                                         |  42%  |                                                                              |==============================                                        |  42%  |                                                                              |==============================                                        |  43%  |                                                                              |==============================                                        |  44%  |                                                                              |===============================                                       |  44%  |                                                                              |===============================                  
                     |  45%  |                                                                              |================================                                      |  45%  |                                                                              |================================                                      |  46%  |                                                                              |=================================                                     |  46%  |                                                                              |=================================                                     |  47%  |                                                                              |=================================                                     |  48%  |                                                                              |==================================                                    |  48%  |                                                                              |==================================                                    |  49%  |                                                                              |===================================                                   |  49%  |                                                                              |===================================                                   |  50%  |                                                                              |===================================                                   |  51%  |                                                                              |====================================                                  |  51%  |                                                                              |====================================                                  |  52%  |                                                                          
    |=====================================                                 |  52%  |                                                                              |=====================================                                 |  53%  |                                                                              |=====================================                                 |  54%  |                                                                              |======================================                                |  54%  |                                                                              |======================================                                |  55%  |                                                                              |=======================================                               |  55%  |                                                                              |=======================================                               |  56%  |                                                                              |========================================                              |  56%  |                                                                              |========================================                              |  57%  |                                                                              |========================================                              |  58%  |                                                                              |=========================================                             |  58%  |                                                                              |=========================================                             |  59%  |                                                                              |==========================================                            |  59%  |                    
                                                          |==========================================                            |  60%  |                                                                              |==========================================                            |  61%  |                                                                              |===========================================                           |  61%  |                                                                              |===========================================                           |  62%  |                                                                              |============================================                          |  62%  |                                                                              |============================================                          |  63%  |                                                                              |============================================                          |  64%  |                                                                              |=============================================                         |  64%  |                                                                              |=============================================                         |  65%  |                                                                              |==============================================                        |  65%  |                                                                              |==============================================                        |  66%  |                                                                              |===============================================                       |  66%  |                                                                              
|===============================================                       |  67%  |                                                                              |===============================================                       |  68%  |                                                                              |================================================                      |  68%  |                                                                              |================================================                      |  69%  |                                                                              |=================================================                     |  69%  |                                                                              |=================================================                     |  70%  |                                                                              |=================================================                     |  71%  |                                                                              |==================================================                    |  71%  |                                                                              |==================================================                    |  72%  |                                                                              |===================================================                   |  72%  |                                                                              |===================================================                   |  73%  |                                                                              |===================================================                   |  74%  |                                                                              |====================================================                  |  74%  |                        
                                                      |====================================================                  |  75%  |                                                                              |=====================================================                 |  75%  |                                                                              |=====================================================                 |  76%  |                                                                              |======================================================                |  76%  |                                                                              |======================================================                |  77%  |                                                                              |======================================================                |  78%  |                                                                              |=======================================================               |  78%  |                                                                              |=======================================================               |  79%  |                                                                              |========================================================              |  79%  |                                                                              |========================================================              |  80%  |                                                                              |========================================================              |  81%  |                                                                              |=========================================================             |  81%  |                                                                              
|=========================================================             |  82%  |                                                                              |==========================================================            |  82%  |                                                                              |==========================================================            |  83%  |                                                                              |==========================================================            |  84%  |                                                                              |===========================================================           |  84%  |                                                                              |===========================================================           |  85%  |                                                                              |============================================================          |  85%  |                                                                              |============================================================          |  86%  |                                                                              |=============================================================         |  86%  |                                                                              |=============================================================         |  87%  |                                                                              |=============================================================         |  88%  |                                                                              |==============================================================        |  88%  |                                                                              |==============================================================        |  89%  |                        
                                                      |===============================================================       |  89%  |                                                                              |===============================================================       |  90%  |                                                                              |===============================================================       |  91%  |                                                                              |================================================================      |  91%  |                                                                              |================================================================      |  92%  |                                                                              |=================================================================     |  92%  |                                                                              |=================================================================     |  93%  |                                                                              |=================================================================     |  94%  |                                                                              |==================================================================    |  94%  |                                                                              |==================================================================    |  95%  |                                                                              |===================================================================   |  95%  |                                                                              |===================================================================   |  96%  |                                                                              
|====================================================================  |  96%  |                                                                              |====================================================================  |  97%  |                                                                              |====================================================================  |  98%  |                                                                              |===================================================================== |  98%  |                                                                              |===================================================================== |  99%  |                                                                              |======================================================================|  99%  |                                                                              |======================================================================| 100%
close(pb) # Close the ABC progress bar opened before the simulation loop

# Combine all accepted samples (collected as a list of one-row data frames)
# into a single data frame with one rbind call.
accepted_samples_df <- do.call(rbind, accepted_samples_list)

# Number of parameters varied in the ABC run.
# Derived from params_to_vary_names instead of hard-coding 2, so the same
# code works if the set of varied parameters changes.
n_varied_params <- length(params_to_vary_names)

# Check if any samples were accepted
if (is.null(accepted_samples_df) || nrow(accepted_samples_df) == 0) {
  cat("\nNo samples were accepted. Try increasing n_abc_simulations, widening priors, or increasing epsilon.\n")
  # Create an empty data frame with the correct column names so downstream
  # plotting code can safely test nrow() == 0 without erroring
  posterior_samples <- data.frame(matrix(ncol = n_varied_params, nrow = 0))
  colnames(posterior_samples) <- params_to_vary_names
} else {
  cat(paste("\nNumber of accepted samples:", nrow(accepted_samples_df), "out of", n_abc_simulations, "\n"))
  cat(paste("Acceptance rate:", round(nrow(accepted_samples_df) / n_abc_simulations * 100, 2), "%\n"))
  # Keep only the varied-parameter columns for subsequent plotting.
  # drop = FALSE keeps a data frame even when only one parameter is varied.
  posterior_samples <- accepted_samples_df[, seq_len(n_varied_params), drop = FALSE]
  colnames(posterior_samples) <- params_to_vary_names
}
## 
## Number of accepted samples: 664 out of 5e+06 
## Acceptance rate: 0.01 %

Marginal and Joint Posterior Distributions

# Check if posterior_samples is empty before plotting
if (is.null(posterior_samples) || nrow(posterior_samples) == 0) {
  cat("\nCannot plot posterior distributions: No samples were accepted in ABC.\n")
} else {
  # Convert the collected posterior samples into a data frame suitable for ggplot
  posterior_df <- as.data.frame(posterior_samples)

  # Helper: histogram + kernel-density plot of one marginal posterior.
  # param_name: string column name in posterior_df (looked up via .data[[ ]]);
  # fill / line: histogram fill colour and density-line colour.
  plot_marginal <- function(param_name, fill, line) {
    ggplot(posterior_df, aes(x = .data[[param_name]])) +
      geom_histogram(aes(y = after_stat(density)), bins = 30, fill = fill, color = "black") +
      geom_density(color = line, linewidth = 1) +
      labs(
        title = paste("Marginal Posterior for Parameter:", param_name),
        x = paste("Value of", param_name),
        y = "Density"
      ) +
      theme_minimal() +
      theme(plot.title = element_text(hjust = 0.5, face = "bold"))
  }

  # Marginal posterior distributions for the two varying parameters,
  # printed in the same order and colour scheme as before
  print(plot_marginal(params_to_vary_names[1], "lightblue", "darkblue"))
  print(plot_marginal(params_to_vary_names[2], "lightgreen", "darkgreen"))

  # Joint posterior: scatter of accepted samples with 2D density contours
  p_joint <- ggplot(posterior_df, aes(x = .data[[params_to_vary_names[1]]], y = .data[[params_to_vary_names[2]]])) +
    geom_point(alpha = 0.2, color = "purple") +
    geom_density_2d(color = "darkblue", linewidth = 0.8) +
    labs(
      title = paste("Joint Posterior Distribution of", params_to_vary_names[1], "and", params_to_vary_names[2]),
      x = paste("Value of", params_to_vary_names[1]),
      y = paste("Value of", params_to_vary_names[2])
    ) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))

  print(p_joint)
}